This R Markdown file contains the report of the data analysis performed for the project on forecasting daily bike rental demand using time series models in R. It covers data exploration, summary statistics, and the construction of time series models. The final report was completed on Thu Jan 30 13:26:30 2025.
Data Description:
This dataset contains the daily count of rental bike transactions between years 2011 and 2012 in Capital bikeshare system with the corresponding weather and seasonal information.
Data Source: https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset
Relevant Paper:
Fanaee-T, Hadi, and Gama, Joao. Event labeling combining ensemble detectors and background knowledge, Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg
## Import required packages
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("ggplot2")
library("dbplyr")
##
## Attaching package: 'dbplyr'
##
## The following objects are masked from 'package:dplyr':
##
## ident, sql
library("timetk")
library("tseries")
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library("forecast")
library('plotly')
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Load the daily bike-sharing dataset bundled with the timetk package
data("bike_sharing_daily")
# Work on a copy so the packaged dataset object is left untouched
day_data <- bike_sharing_daily
# Inspect structure: the echoed output below shows 731 daily rows
# (2011-2012) and 16 columns
str(day_data)
## spc_tbl_ [731 × 16] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ instant : num [1:731] 1 2 3 4 5 6 7 8 9 10 ...
## $ dteday : Date[1:731], format: "2011-01-01" "2011-01-02" ...
## $ season : num [1:731] 1 1 1 1 1 1 1 1 1 1 ...
## $ yr : num [1:731] 0 0 0 0 0 0 0 0 0 0 ...
## $ mnth : num [1:731] 1 1 1 1 1 1 1 1 1 1 ...
## $ holiday : num [1:731] 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday : num [1:731] 6 0 1 2 3 4 5 6 0 1 ...
## $ workingday: num [1:731] 0 0 1 1 1 1 1 0 0 1 ...
## $ weathersit: num [1:731] 2 2 1 1 1 1 2 2 1 1 ...
## $ temp : num [1:731] 0.344 0.363 0.196 0.2 0.227 ...
## $ atemp : num [1:731] 0.364 0.354 0.189 0.212 0.229 ...
## $ hum : num [1:731] 0.806 0.696 0.437 0.59 0.437 ...
## $ windspeed : num [1:731] 0.16 0.249 0.248 0.16 0.187 ...
## $ casual : num [1:731] 331 131 120 108 82 88 148 68 54 41 ...
## $ registered: num [1:731] 654 670 1229 1454 1518 ...
## $ cnt : num [1:731] 985 801 1349 1562 1600 ...
## - attr(*, "spec")=
## .. cols(
## .. instant = col_double(),
## .. dteday = col_date(format = ""),
## .. season = col_double(),
## .. yr = col_double(),
## .. mnth = col_double(),
## .. holiday = col_double(),
## .. weekday = col_double(),
## .. workingday = col_double(),
## .. weathersit = col_double(),
## .. temp = col_double(),
## .. atemp = col_double(),
## .. hum = col_double(),
## .. windspeed = col_double(),
## .. casual = col_double(),
## .. registered = col_double(),
## .. cnt = col_double()
## .. )
# Add three derived series used throughout the rest of the analysis:
#   ncnt - total rentals scaled by the maximum daily total (range [0, 1])
#   nr   - registered rentals scaled by the maximum daily registered count
#   rr   - total rentals scaled by the maximum daily registered count
# NOTE(review): rr can exceed 1 (the summary below shows max ~1.25), so it
# is a scaled copy of cnt rather than a true per-day ratio -- presumably
# intentional as a "demand vs. registration" index; confirm with the author.
day_data$ncnt <- day_data$cnt / max(day_data$cnt)
day_data$nr <- day_data$registered / max(day_data$registered)
day_data$rr <- day_data$cnt / max(day_data$registered)
# Summary statistics for all columns, including the new ones
summary(day_data)
## instant dteday season yr
## Min. : 1.0 Min. :2011-01-01 Min. :1.000 Min. :0.0000
## 1st Qu.:183.5 1st Qu.:2011-07-02 1st Qu.:2.000 1st Qu.:0.0000
## Median :366.0 Median :2012-01-01 Median :3.000 Median :1.0000
## Mean :366.0 Mean :2012-01-01 Mean :2.497 Mean :0.5007
## 3rd Qu.:548.5 3rd Qu.:2012-07-01 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :731.0 Max. :2012-12-31 Max. :4.000 Max. :1.0000
## mnth holiday weekday workingday
## Min. : 1.00 Min. :0.00000 Min. :0.000 Min. :0.000
## 1st Qu.: 4.00 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.000
## Median : 7.00 Median :0.00000 Median :3.000 Median :1.000
## Mean : 6.52 Mean :0.02873 Mean :2.997 Mean :0.684
## 3rd Qu.:10.00 3rd Qu.:0.00000 3rd Qu.:5.000 3rd Qu.:1.000
## Max. :12.00 Max. :1.00000 Max. :6.000 Max. :1.000
## weathersit temp atemp hum
## Min. :1.000 Min. :0.05913 Min. :0.07907 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:0.33708 1st Qu.:0.33784 1st Qu.:0.5200
## Median :1.000 Median :0.49833 Median :0.48673 Median :0.6267
## Mean :1.395 Mean :0.49538 Mean :0.47435 Mean :0.6279
## 3rd Qu.:2.000 3rd Qu.:0.65542 3rd Qu.:0.60860 3rd Qu.:0.7302
## Max. :3.000 Max. :0.86167 Max. :0.84090 Max. :0.9725
## windspeed casual registered cnt
## Min. :0.02239 Min. : 2.0 Min. : 20 Min. : 22
## 1st Qu.:0.13495 1st Qu.: 315.5 1st Qu.:2497 1st Qu.:3152
## Median :0.18097 Median : 713.0 Median :3662 Median :4548
## Mean :0.19049 Mean : 848.2 Mean :3656 Mean :4504
## 3rd Qu.:0.23321 3rd Qu.:1096.0 3rd Qu.:4776 3rd Qu.:5956
## Max. :0.50746 Max. :3410.0 Max. :6946 Max. :8714
## ncnt nr rr
## Min. :0.002525 Min. :0.002879 Min. :0.003167
## 1st Qu.:0.361717 1st Qu.:0.359488 1st Qu.:0.453786
## Median :0.521919 Median :0.527210 Median :0.654765
## Mean :0.516909 Mean :0.526371 Mean :0.648481
## 3rd Qu.:0.683498 3rd Qu.:0.687662 3rd Qu.:0.857472
## Max. :1.000000 Max. :1.000000 Max. :1.254535
## Read about the timetk package
# ?timetk
library(plotly)
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(htmlwidgets))

# Exploratory plots of the raw (un-smoothed) daily series.
# Each plot draws the series as lines coloured by season (1-4).

# Temperature over time
day_data %>%
  plot_ly(x = ~dteday, y = ~temp, type = 'scatter', mode = 'lines',
          color = ~factor(season)) %>%
  layout(title = "Normalized Temperature vs Date",
         xaxis = list(title = "Date"),
         yaxis = list(title = "Normalized Temperature"))

# Humidity over time
day_data %>%
  plot_ly(x = ~dteday, y = ~hum, type = 'scatter', mode = 'lines',
          color = ~factor(season)) %>%
  layout(title = "Normalized Humidity vs Date",
         xaxis = list(title = "Date"),
         yaxis = list(title = "Normalized Humidity"))

# Windspeed over time
day_data %>%
  plot_ly(x = ~dteday, y = ~windspeed, type = 'scatter', mode = 'lines',
          color = ~factor(season)) %>%
  layout(title = "Windspeed vs Date for Day Data",
         xaxis = list(title = "Date"),
         yaxis = list(title = "Normalized Windspeed"))

# Normalized total rentals over time
day_data %>%
  plot_ly(x = ~dteday, y = ~ncnt, type = 'scatter', mode = 'lines',
          color = ~factor(season)) %>%
  layout(title = "Normalized Bike Rentals vs Date for Day Data",
         xaxis = list(title = "Date"),
         yaxis = list(title = "Normalized Bike Rentals"))

# Normalized registered rentals over time
day_data %>%
  plot_ly(x = ~dteday, y = ~nr, type = 'scatter', mode = 'lines',
          color = ~factor(season)) %>%
  layout(title = "Normalized Registered Rentals vs Date for Day Data",
         xaxis = list(title = "Date"),
         yaxis = list(title = "Normalized Registered Rentals"))

# Ratio of total rentals to registrations over time
day_data %>%
  plot_ly(x = ~dteday, y = ~rr, type = 'scatter', mode = 'lines',
          color = ~factor(season)) %>%
  layout(title = "Ratio of Rentals to Registration vs Date for Day Data",
         xaxis = list(title = "Date"),
         yaxis = list(title = "Ratio"))
# Clean the four series of interest: coerce each column to a plain numeric
# vector, then run forecast::tsclean() on it to interpolate missing values
# and damp outliers. (Original code did the coercions and the cleaning in
# two separate passes; per-column the result is identical.)
for (series in c("temp", "ncnt", "nr", "rr")) {
  day_data[[series]] <- tsclean(as.numeric(day_data[[series]]))
}
# View the cleaned data (optional)
# head(day_data)
# Re-plot the cleaned series. These mirror the raw-data plots above but use
# the tsclean()-smoothed columns; colour encodes season (1-4).

# Temperature after cleaning
plot_ly(data = day_data, x = ~dteday, y = ~temp, type = 'scatter',
        mode = 'lines', color = ~factor(season)) %>%
  layout(title = "Normalized Temperature vs Date for Day Data",
         xaxis = list(title = "Date"),
         yaxis = list(title = "Normalized Temperature"))

# Normalized total rentals after cleaning
plot_ly(data = day_data, x = ~dteday, y = ~ncnt, type = 'scatter',
        mode = 'lines', color = ~factor(season)) %>%
  layout(title = "Normalized Bike Rentals vs Date for Day Data (Smoothed)",
         xaxis = list(title = "Date"),
         yaxis = list(title = "Normalized Bike Rentals"))

# Normalized registered rentals after cleaning
plot_ly(data = day_data, x = ~dteday, y = ~nr, type = 'scatter',
        mode = 'lines', color = ~factor(season)) %>%
  layout(title = "Normalized Registered Rentals vs Date for Day Data (Smoothed)",
         xaxis = list(title = "Date"),
         yaxis = list(title = "Normalized Registered Rentals"))

# Ratio of total rentals to registrations after cleaning
plot_ly(data = day_data, x = ~dteday, y = ~rr, type = 'scatter',
        mode = 'lines', color = ~factor(season)) %>%
  layout(title = "Ratio of Rentals to Registration vs Date for Day Data (Smoothed)",
         xaxis = list(title = "Date"),
         yaxis = list(title = "Ratio"))
# Ensure the columns are numeric
# NOTE(review): this is presumably NOT redundant with the coercion done
# before tsclean() -- tsclean() typically returns "ts" objects, so this
# strips those attributes back to plain numeric vectors before the ADF
# tests below rebuild each series as ts(..., frequency = 365). Confirm.
day_data[["temp"]] <- as.numeric(day_data[["temp"]])
day_data[["ncnt"]] <- as.numeric(day_data[["ncnt"]])
day_data[["nr"]] <- as.numeric(day_data[["nr"]])
day_data[["rr"]] <- as.numeric(day_data[["rr"]])
# Augmented Dickey-Fuller test on the cleaned temperature series.
# H0: unit root (non-stationary); a small p-value favours stationarity.
temp_ts <- ts(day_data$temp, frequency = 365)
adf_test_temp <- adf.test(temp_ts)
print(adf_test_temp)
##
## Augmented Dickey-Fuller Test
##
## data: temp_ts
## Dickey-Fuller = -1.6785, Lag order = 9, p-value = 0.7144
## alternative hypothesis: stationary
# Augmented Dickey-Fuller test on the normalized total-rentals series.
# H0: unit root (non-stationary); a small p-value favours stationarity.
ncnt_ts <- ts(day_data$ncnt, frequency = 365)
adf_test_ncnt <- adf.test(ncnt_ts)
print(adf_test_ncnt)
##
## Augmented Dickey-Fuller Test
##
## data: ncnt_ts
## Dickey-Fuller = -1.3084, Lag order = 9, p-value = 0.871
## alternative hypothesis: stationary
# Augmented Dickey-Fuller test on the normalized registered-rentals series.
# H0: unit root (non-stationary); a small p-value favours stationarity.
nr_ts <- ts(day_data$nr, frequency = 365)
adf_test_nr <- adf.test(nr_ts)
print(adf_test_nr)
##
## Augmented Dickey-Fuller Test
##
## data: nr_ts
## Dickey-Fuller = -2.418, Lag order = 9, p-value = 0.4014
## alternative hypothesis: stationary
# Augmented Dickey-Fuller test on the rentals-to-registration ratio series.
# H0: unit root (non-stationary); a small p-value favours stationarity.
# (Its statistics match ncnt's because rr is a rescaled copy of cnt and
# the ADF statistic is scale-invariant.)
rr_ts <- ts(day_data$rr, frequency = 365)
adf_test_rr <- adf.test(rr_ts)
print(adf_test_rr)
##
## Augmented Dickey-Fuller Test
##
## data: rr_ts
## Dickey-Fuller = -1.3084, Lag order = 9, p-value = 0.871
## alternative hypothesis: stationary
# Ensure 'nr' is extracted as a numeric vector and not a list
norm_rentals <- ts(as.numeric(unlist(day_data[,"nr"])), frequency = 365)
# STL decomposition into seasonal / trend / remainder components.
# FIX: pass s.window by name for clarity and consistency with the other
# two stl() calls below. Behaviour is unchanged -- "periodic" was already
# binding positionally to s.window.
decomped1 <- stl(norm_rentals, s.window = "periodic")
# Plot the TREND component: time.series columns are (seasonal, trend,
# remainder), so column 2 is the trend, not the residual as the earlier
# comment claimed. The axis label text is kept as rendered in the report.
plot(decomped1$time.series[,2],
ylab = "Stationary of the Normalized Rental Reservations",
xlab = "Day of the Year")
# Ljung-Box check for leftover autocorrelation in the remainder (column 3)
checkresiduals(decomped1$time.series[, 3])
##
## Ljung-Box test
##
## data: Residuals
## Q* = 1997.3, df = 146, p-value < 2.2e-16
##
## Model df: 0. Total lags used: 146
# Ensure 'ncnt' is extracted as a numeric vector and not a list
norm_cnt <- ts(as.numeric(unlist(day_data[,"ncnt"])), frequency = 365)
# STL decomposition into seasonal / trend / remainder components
decomped2 <- stl(norm_cnt, s.window = "periodic")
# Plot the TREND component: time.series columns are (seasonal, trend,
# remainder), so column 2 is the trend, not the residuals as previously
# stated
plot(decomped2$time.series[,2],
ylab = "Stationary of the Normalized Rental Counts",
xlab = "Day of the Year")
# Ljung-Box check on the remainder component (column 3)
checkresiduals(decomped2$time.series[, 3])
##
## Ljung-Box test
##
## data: Residuals
## Q* = 1437.8, df = 146, p-value < 2.2e-16
##
## Model df: 0. Total lags used: 146
# Ensure 'rr' is extracted as a numeric vector and not a list
norm_rr <- ts(as.numeric(unlist(day_data[,"rr"])), frequency = 365)
# STL decomposition into seasonal / trend / remainder components.
# NOTE(review): rr is a rescaled copy of cnt (cnt / max(registered)), so
# the test statistics below are identical to decomped2's -- presumably
# intentional, but worth confirming.
decomped3 <- stl(norm_rr, s.window = "periodic")
# Plot the TREND component: time.series columns are (seasonal, trend,
# remainder), so column 2 is the trend, not the residuals as previously
# stated
plot(decomped3$time.series[,2],
ylab = "Stationary of the Normalized Rental Counts to Reservations",
xlab = "Day of the Year")
# Ljung-Box check on the remainder component (column 3)
checkresiduals(decomped3$time.series[, 3])
##
## Ljung-Box test
##
## data: Residuals
## Q* = 1437.8, df = 146, p-value < 2.2e-16
##
## Model df: 0. Total lags used: 146
# Checking for Normality of the STL remainder components using the
# Shapiro-Wilk test (H0: the remainder is normally distributed).
# For Normalized Rental Reservations (decomped1)
print("-------Normalized Rental Reservations")
## [1] "-------Normalized Rental Reservations"
shapiro.test(decomped1$time.series[, 3])
##
## Shapiro-Wilk normality test
##
## data: decomped1$time.series[, 3]
## W = 0.99554, p-value = 0.03334
# For Normalized Count of Rentals (decomped2)
print("-------Normalized Count of Rentals")
## [1] "-------Normalized Count of Rentals"
shapiro.test(decomped2$time.series[, 3])
##
## Shapiro-Wilk normality test
##
## data: decomped2$time.series[, 3]
## W = 0.99702, p-value = 0.1993
# For Normalized Ratio of Rentals to Reservations (decomped3)
# NOTE: identical W and p-value to decomped2 because rr is a rescaled
# copy of cnt and the Shapiro-Wilk statistic is scale-invariant
print("-------Normalized Ratio of Rentals to Reservations")
## [1] "-------Normalized Ratio of Rentals to Reservations"
shapiro.test(decomped3$time.series[, 3])
##
## Shapiro-Wilk normality test
##
## data: decomped3$time.series[, 3]
## W = 0.99702, p-value = 0.1993
# Fit ARIMA model to normalized bike count data
# (auto.arima selects the order automatically; seasonal terms allowed)
fit1 <- auto.arima(norm_cnt, seasonal = TRUE)
# Plot the histogram of residuals from the ARIMA model
hist(fit1$residuals,
xlab = "Residual",
ylab = "Distribution",
main = "Histogram of Model Errors - Bike Count")
# Shapiro-Wilk test on residuals; the small p-value below indicates the
# residuals are not normally distributed
shapiro.test(fit1$residuals)
##
## Shapiro-Wilk normality test
##
## data: fit1$residuals
## W = 0.83122, p-value < 2.2e-16
# Forecast the next 30 periods using the ARIMA model
# (the earlier comment said 25, but the forecast horizon is 30)
prediction1 <- forecast(fit1, 30)
# Plot the forecasted bike rental counts
plot(prediction1, xlab = "Date", ylab = "Normalized Count of Rentals", main = "Prediction of Bike Rental Counts")
# Fit the ARIMA model for the normalized registered rentals
fit2 <- auto.arima(norm_rentals, seasonal = TRUE)
# Plot the histogram of residuals to analyze model errors
hist(fit2$residuals, xlab = "Residual", ylab = "Distribution", main = "Histogram of Model Errors - Rental Count")
# Shapiro-Wilk test on residuals; the small p-value below indicates the
# residuals are not normally distributed
shapiro.test(fit2$residuals)
##
## Shapiro-Wilk normality test
##
## data: fit2$residuals
## W = 0.85305, p-value < 2.2e-16
# Generate forecast for the normalized rentals for the next 30 periods
# (the earlier comment said 25, but the forecast horizon is 30)
prediction2 <- forecast(fit2, 30)
# Plot the forecasted predictions with confidence intervals
plot(prediction2, xlab = "Date", ylab = "Normalized Rentals", main = "Prediction of Bike Rentals")
# Fit ARIMA model to the normalized rentals-to-reservations ratio series.
# FIX: the original ran auto.arima(norm_cnt, ...) here and assigned it to
# fit2, silently overwriting the registered-rentals model and modelling the
# wrong series -- the histogram title, prediction3 and the plot titles all
# refer to the count-to-registration ratio (norm_rr). Use norm_rr and a
# distinct name fit3.
fit3 <- auto.arima(norm_rr, seasonal = TRUE)
# Plot histogram of residuals (errors) from the model
hist(fit3$residuals, xlab = "Residual", ylab = "Distribution", main = "Histogram of Model Errors - Count to Rental Ratio")
shapiro.test(fit3$residuals)
##
## Shapiro-Wilk normality test
##
## data: fit2$residuals
## W = 0.83122, p-value < 2.2e-16
# NOTE(review): the echoed output above came from the original (buggy) run
# on norm_cnt; re-render the report to refresh it.
# Generate forecast for the next 30 periods
prediction3 <- forecast(fit3, 30)
# Plot the forecasted values
plot(prediction3, xlab = "Date", ylab = "Normalized Rental Ratio", main = "Prediction of Bike Rentals to Reservations")